package bswd;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.List;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Another crawler for jw.nju.edu.cn.
 * Retrieves elements with XPath.
 * XPath reference: http://www.w3schools.com/xpath/default.asp
 * Benchmark:
 * 	test2: processed 3006 ms
 * 	this: processed 8582 ms
 * @author Kavinyao
 *
 */
/*
 * Reviewer note: why is this so slow? And the code is somewhat unreadable to me...
 */
public class AnotherJWCrawler {
	/** Maximum number of news links that are followed and written out. */
	private static final int LIMIT = 16;
	/** Scheme and host prepended to every requested path. */
	private static final String DOMAIN = "http://jw.nju.edu.cn";
	/** Text of an inline script that is stripped from each article body before writing. */
	private static final String JS = "function submit(){var  th=document.form1;th.action=\"down-file.do\";th.submit();}";

	/** Output file used when no path is given on the command line. */
	private static final String DEFAULT_OUTPUT_PATH = "D:\\test3.txt";
	/** XPath of the index-page table(s); the first match is treated as the news list. */
	private static final String NEWS_TABLE_XPATH = "//html//body//div[@id='main']//table[@width='1000px']//tbody//tr//td//table//tbody//tr//td//table//tbody//tr//td//table[@width='522']";
	/** XPath, relative to the news table, of the anchors pointing at the articles. */
	private static final String NEWS_LINK_XPATH = ".//tr//td//div//table//tr//td//a";
	/** Position among all {@code <table>} elements of the table whose first two cells are written first. */
	private static final int HEADER_TABLE_INDEX = 9;
	/** Position among all {@code <table>} elements of the table holding the article body. */
	private static final int BODY_TABLE_INDEX = 10;
	/** Number of leading href characters dropped before the href is appended to {@link #DOMAIN}. */
	private static final int HREF_PREFIX_LENGTH = 2;

	/**
	 * Fetches the index page, follows at most {@link #LIMIT} news links and
	 * appends the text of each article to the output file.
	 * <p>
	 * Failures are reported on standard error; the method itself never throws.
	 * The writer and the web client are released even when the crawl fails, so
	 * articles written before a failure are flushed to disk.
	 *
	 * @param args optional; {@code args[0]} overrides the output file path
	 *             (defaults to {@code D:\test3.txt})
	 */
	public static void main(String[] args) {
		final String outputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_OUTPUT_PATH;
		WebClient client = null;
		try {
			client = new WebClient(BrowserVersion.FIREFOX_3_6);
			client.setJavaScriptEnabled(false);
			System.out.println("Start connecting...");
			final HtmlPage page = client.getPage(DOMAIN + "/root/index.html");
			System.out.println("Successfully connected, start processing...");

			final List<HtmlElement> links = findNewsLinks(page);

			// NOTE(review): FileWriter encodes with the platform default charset;
			// kept as-is so the output encoding does not change.
			final BufferedWriter writer = new BufferedWriter(new FileWriter(new File(outputPath)));
			try {
				for (int i = 0; i < links.size() && i < LIMIT; i++) {
					System.out.println("processing element " + i);
					writeArticle(client, links.get(i), writer);
				}
			} finally {
				// Always close: this also flushes whatever was buffered before a failure.
				writer.close();
			}
			System.out.println("All done...");
		} catch (Exception e) {
			// Top-level boundary: report the full failure instead of just the
			// (possibly null) message, so the cause can actually be diagnosed.
			System.err.println("Crawl failed:");
			e.printStackTrace();
		} finally {
			if (client != null) {
				client.closeAllWindows();
			}
		}
	}

	/**
	 * Locates the news table on the index page and returns the anchors inside it.
	 *
	 * @param page the already loaded index page
	 * @return the anchors matched by {@link #NEWS_LINK_XPATH}, possibly empty
	 * @throws IllegalStateException if no table matches {@link #NEWS_TABLE_XPATH}
	 */
	// Both XPath expressions end in an element name test (table / a), so the
	// untyped result lists are expected to hold HtmlElement instances only.
	@SuppressWarnings("unchecked")
	private static List<HtmlElement> findNewsLinks(HtmlPage page) {
		final List<HtmlElement> tables = (List<HtmlElement>) page.getByXPath(NEWS_TABLE_XPATH);
		if (tables.isEmpty()) {
			throw new IllegalStateException("News table not found on index page; the page layout may have changed");
		}
		final HtmlElement newsTable = tables.get(0);
		return (List<HtmlElement>) newsTable.getByXPath(NEWS_LINK_XPATH);
	}

	/**
	 * Downloads the article behind one news link and appends its header cells
	 * and body text to the writer, using CRLF line endings.
	 *
	 * @param client the web client used to fetch the article page
	 * @param link   anchor element whose {@code href} points at the article
	 * @param writer destination for the extracted text
	 * @throws IOException           if fetching the page or writing fails
	 * @throws IllegalStateException if the href or the page does not have the expected shape
	 */
	private static void writeArticle(WebClient client, HtmlElement link, BufferedWriter writer) throws IOException {
		System.out.println("element title: " + link.getTextContent());

		final String href = link.getAttribute("href");
		if (href == null || href.length() < HREF_PREFIX_LENGTH) {
			throw new IllegalStateException("Unexpected href on news link: '" + href + "'");
		}
		// presumably hrefs look like "../path", so dropping two characters
		// leaves a site-absolute "/path" -- TODO confirm against the live page
		final HtmlPage articlePage = client.getPage(DOMAIN + href.substring(HREF_PREFIX_LENGTH));
		System.out.println(articlePage.hashCode());

		final List<HtmlElement> tables = articlePage.getElementsByTagName("table");
		if (tables.size() <= BODY_TABLE_INDEX) {
			throw new IllegalStateException("Expected at least " + (BODY_TABLE_INDEX + 1)
					+ " tables on article page but found " + tables.size());
		}

		// the <table> element with the news header
		final List<HtmlElement> headerCells = tables.get(HEADER_TABLE_INDEX).getElementsByTagName("td");
		if (headerCells.size() < 2) {
			throw new IllegalStateException("Expected at least 2 header cells on article page but found "
					+ headerCells.size());
		}
		writer.write(headerCells.get(0).getTextContent() + "\r\n");
		writer.write(headerCells.get(1).getTextContent().replace("【","").replace("】","") + "\r\n");

		// main body: drop the inline script text and normalise line endings
		final String content = tables.get(BODY_TABLE_INDEX).getTextContent().replace(JS, "").replace("\n", "\r\n");
		writer.write(content);
		writer.write("\r\n");
	}

}
